# Computations
import numpy as np
import pandas as pd
# scipy
import scipy.stats as stats
from scipy.stats import norm
# preprocessing
from sklearn.preprocessing import StandardScaler
# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score,\
KFold, StratifiedShuffleSplit
from sklearn.feature_selection import RFE
# NOTE(review): sklearn.utils.fixes.loguniform was removed in scikit-learn 1.x
# (import from scipy.stats instead); it also appears unused in this file.
from sklearn.utils.fixes import loguniform
from sklearn.tree import DecisionTreeClassifier
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6; this call raises there.
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
# Global figure defaults for every plot in this notebook.
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# NOTE(review): silencing ALL warnings also hides deprecation notices;
# consider filtering specific categories instead.
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two years) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Load the pre-cleaned Telco churn data; drop the customer identifier, which
# carries no predictive information, and name the prediction target.
# NOTE(review): the column table above spells it 'customerID' (no space) —
# confirm the cleaned CSV really uses 'customer ID', or this drop will raise.
Data = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn_clean.csv')
df = Data.drop(columns = ['customer ID'])
Target = 'Churn'
Let's take a look at the variance of the features.
# Single-row heatmap of per-feature variance (descending) BEFORE scaling.
fig, ax = plt.subplots(figsize=(17,16))
# One column per feature; the cell value is that feature's variance.
Temp = df.drop(columns = [Target]).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap =sns.color_palette("OrRd", 20),
                linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0],
                cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
# Wrap long tick labels onto multiple lines.
# NOTE(review): the 'Euribor three' substitutions look like leftovers from a
# bank-marketing dataset; they are no-ops for the Telco column names.
labels = [x.replace(' ','\n').replace('Euribor\nthree','Euribor three').replace('\nof\n',' of\n')
          for x in [item.get_text() for item in ax.get_xticklabels()]]
_ = ax.set_xticklabels(labels)
_ = ax.set_yticklabels('')
Furthermore, we standardize the features by removing the mean and scaling to unit variance; the benefits of scaling data with StandardScaler() were demonstrated in a separate article.
# Scaling
# Standardize every non-target column to zero mean / unit variance, in place.
Temp = [col for col in df.columns if col != Target]
scaler = StandardScaler()
df[Temp] = scaler.fit_transform(df[Temp])
# Variance Plot
# Re-draw the variance heatmap AFTER standardization: every feature should now
# show variance ~1, confirming the scaling worked.
Fig, ax = plt.subplots(figsize=(17,16))
Temp = df.drop(columns = [Target]).var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap =sns.color_palette('Greens'),
                linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0],
                cbar_kws={'label': 'Feature Variance', "aspect":40, "shrink": .4, "orientation": "horizontal"})
# Wrap long tick labels; the 'Euribor three' substitution is a no-op leftover
# from another dataset's column names.
labels = [x.replace(' ','\n').replace('Euribor\nthree','Euribor three').replace('\nof\n',' of\n')
          for x in [item.get_text() for item in ax.get_xticklabels()]]
_ = ax.set_xticklabels(labels)
_ = ax.set_yticklabels('')
# Persist the standardized data for reuse in later analyses.
df.to_csv (r'telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn_clean_STD.csv', index = None, header=True)
First, consider the class distribution of the Churn target.
def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    """Print Text on a colored background, padded with '=' to total width L."""
    rule = '=' * (L - len(Text) - 1)
    banner = C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL
    print(banner + ' ' + C2 + Style.NORMAL + rule + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
def Table1(Inp, Feat = Target):
    """Frequency table for Inp[Feat]: one row per class with count and percentage."""
    counts = Inp[Feat].value_counts()
    Out = counts.to_frame('Number of Instances').reset_index()
    Out = Out.rename(columns = {'index': Feat})  # pandas<2 names this column 'index'
    share = Out['Number of Instances'].values / Out['Number of Instances'].sum()
    Out['Percentage'] = np.round(100 * share, 2)
    return Out
def Plot1(data, x= 'Number of Instances', Feat = Target, CL = ['FireBrick', 'SeaGreen']):
    """Draw one stacked horizontal bar showing the class balance.

    data : frame produced by Table1 (one row per class, including a
           'Percentage' column used for the in-bar annotations).
    x    : column plotted along the bar length.
    Feat : column that colors the segments.
    CL   : one color per class segment.
    """
    total = Data.shape[0]  # full-dataset row count fixes the x-axis range
    fig = px.bar(data, x=x, y=['', ''], orientation='h', color=Feat,
                 text='Percentage', color_discrete_sequence=CL, height=180)
    fig.update_layout(plot_bgcolor='white', legend_orientation='h',
                      legend=dict(x=0, y=1.7),
                      xaxis=dict(tickmode='array', tickvals=[0, total],
                                 ticktext=['', '']))
    fig.update_traces(marker_line_color='Black', marker_line_width=1.5, opacity=1)
    fig.update_traces(texttemplate='%{text:.2}% ', textposition='inside')
    fig.update_xaxes(title_text=None, range=[0, total])
    fig.update_yaxes(title_text=None)
    fig.show()
# Overall class balance of the raw data: map the 0/1 target to No/Yes labels
# and draw the stacked percentage bar.
Temp = Table1(Data, Feat = Target)
Temp[Target] = Temp[Target].map({0:'No',1:'Yes'})
Plot1(data = Temp)
StratifiedShuffleSplit is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Stratified 70/30 train/test split that preserves the Churn class ratio.
X = df.drop(columns = Target).values
y = df[Target].values
Labels = ['No', 'Yes']
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
# n_splits=1, so this loop runs once and leaves the single split behind.
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
del sss
# Report the class balance of each split.
Header('Train Set')
Temp = Table1(df, Feat = Target)
# Overwrite the full-data counts with this split's actual counts.
# NOTE(review): np.unique returns classes sorted (0 then 1) while value_counts
# sorts by frequency; the rows align here only because class 0 (No) is the
# majority class — verify if the data ever changes.
_, Temp['Number of Instances'] = np.unique(y_train, return_counts=True)
Temp['Percentage'] = np.round(100* Temp['Number of Instances'].values /Temp['Number of Instances'].sum(), 2)
Temp[Target] = Temp[Target].map(lambda x: Labels[0] if x ==0 else Labels[1])
display(Temp.style.format({'Percentage': "{:.2f}"}).hide_index())
Header('Test Set')
Temp = Table1(df, Feat = Target)
_, Temp['Number of Instances'] = np.unique(y_test, return_counts=True)
Temp['Percentage'] = np.round(100* Temp['Number of Instances'].values /Temp['Number of Instances'].sum(), 2)
Temp[Target] = Temp[Target].map(lambda x: Labels[0] if x ==0 else Labels[1])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0 (use .hide()).
display(Temp.style.format({'Percentage': "{:.2f}"}).hide_index())
del Temp
Line()
Train Set ==========================================================================================
| Churn | Number of Instances | Percentage |
|---|---|---|
| No | 3622 | 73.47 |
| Yes | 1308 | 26.53 |
Test Set ===========================================================================================
| Churn | Number of Instances | Percentage |
|---|---|---|
| No | 1552 | 73.45 |
| Yes | 561 | 26.55 |
====================================================================================================
Decision Tree Classifier uses a decision tree (as a predictive model) to go from observations about an item (represented in the branches) to conclusions about the item's target value (represented in the leaves).
def Grid_Table(grid):
    """Summarize a fitted Grid/RandomizedSearchCV as a rank-indexed DataFrame.

    Returns a frame with a readable 'params' string plus mean test score and
    mean fit time, rounded to 4 decimals and sorted by rank.
    """
    params = [
        str(p).replace('{', '').replace('}', '').replace("'", '')
        for p in grid.cv_results_['params']
    ]
    Table = pd.DataFrame({
        'rank_test_score': grid.cv_results_['rank_test_score'],
        'params': params,
        'mean_test_score': grid.cv_results_['mean_test_score'],
        'mean_fit_time': grid.cv_results_['mean_fit_time'],
    })
    return Table.round(4).sort_values('rank_test_score').set_index('rank_test_score')
def Grid_Performance_Plot(Table):
    """Plot mean test score and mean fit time against each parameter setting.

    Table: output of Grid_Table — rank-indexed, with 'params',
    'mean_test_score' and 'mean_fit_time' columns.
    """
    font = FontProperties()
    font.set_weight('bold')
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    # One panel per metric: (column, line color, panel title).
    Z = zip(axes, ['mean_test_score', 'mean_fit_time'], ['Blue', 'Red'],['Classification Accuracy', 'Fit Time (with caching)'])
    for ax, col, c, title in Z:
        # NOTE(review): yerr is the metric itself rather than a spread measure,
        # so each error bar spans 0..2x the value — confirm this is intended.
        _ = ax.errorbar(x = Table['params'], y = Table[col], yerr = Table[col], color = c)
        _ = ax.set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
        _ = ax.set_ylim(bottom = 0)
        _ = ax.set_xlabel('Paramerers')  # NOTE(review): typo in the displayed label (runtime text, left as-is here)
        _ = ax.set_title(title, fontproperties=font, fontsize = 14)
# Randomized hyper-parameter search over tree depth and split criterion,
# optimizing ROC AUC; refit=True retrains the best model on all training data.
# NOTE(review): cv = KFold(n_splits = X.shape[1]) makes the number of CV folds
# equal the number of FEATURES — a fixed k (e.g. 5 or 10) was probably
# intended; confirm.
# NOTE(review): the grid holds 2 criteria x 12 depths = 24 candidates, so
# n_iter = 30 exceeds the search space (sklearn warns and runs exhaustively).
grid_dtc = RandomizedSearchCV(DecisionTreeClassifier(),
                              {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),
                              n_iter = 30,
                              scoring = 'roc_auc',
                              error_score = 0,
                              verbose = 3,
                              n_jobs = 10,
                              refit = True)
_ = grid_dtc.fit(X_train, y_train)
clear_output()
# Best score/parameters plus hold-out accuracy of the refit estimator.
# NOTE(review): Styler.hide_index() and .set_precision() were removed in
# pandas 2.0 (use .hide() and .format(precision=...)); 'Paramerers' is a typo
# in the displayed column name (runtime text, left as-is here).
display(pd.DataFrame({'Best Score': [grid_dtc.best_score_], 'Best Paramerers': [str(grid_dtc.best_params_)],
                      'Accuracy': [grid_dtc.score(X_test,y_test)]}).round(4).style.hide_index().set_precision(4))
# Top-10 parameter settings, shaded by score (green) and fit time (orange).
Table = Grid_Table(grid_dtc)
display(Table.reset_index(drop = False).head(10).style.hide_index().\
        set_precision(4).background_gradient(subset= ['mean_test_score'], cmap='Greens').\
        background_gradient(subset= ['mean_fit_time'], cmap='Oranges'))
Grid_Performance_Plot(Table)
| Best Score | Best Paramerers | Accuracy |
|---|---|---|
| 0.8360 | {'max_depth': 5, 'criterion': 'gini'} | 0.8299 |
| rank_test_score | params | mean_test_score | mean_fit_time |
|---|---|---|---|
| 1 | max_depth: 5, criterion: gini | 0.8360 | 0.0092 |
| 2 | max_depth: 4, criterion: gini | 0.8335 | 0.0079 |
| 3 | max_depth: 5, criterion: entropy | 0.8315 | 0.0113 |
| 4 | max_depth: 4, criterion: entropy | 0.8299 | 0.0093 |
| 5 | max_depth: 6, criterion: gini | 0.8267 | 0.0109 |
| 6 | max_depth: 6, criterion: entropy | 0.8250 | 0.0129 |
| 7 | max_depth: 3, criterion: gini | 0.8202 | 0.0063 |
| 8 | max_depth: 3, criterion: entropy | 0.8193 | 0.0075 |
| 9 | max_depth: 7, criterion: entropy | 0.8134 | 0.0145 |
| 10 | max_depth: 7, criterion: gini | 0.8094 | 0.0120 |
Now, we can develop a model using the best parameters for the DTC. Here, we use 10 stratified randomized folds made by preserving the percentage of samples for each class.
# Re-run the stratified 70/30 split 10 times; for each split, fit a tree with
# the best parameters found above and collect per-class classification reports
# and confusion matrices for both train and test sets.
n_splits = 10
sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
Reports_Train = []
Reports_Test = []
CM_Train = []
CM_Test = []
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dtc = DecisionTreeClassifier(**grid_dtc.best_params_)
    _ = dtc.fit(X_train,y_train)
    # Train
    y_pred = dtc.predict(X_train)
    R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
    Reports_Train.append(R.values)
    CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
    # Test
    y_pred = dtc.predict(X_test)
    R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
    Reports_Test.append(R.values)
    CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
# Train: stack the 10 flattened reports/matrices, then aggregate.
# R (the last report frame) supplies the row/column labels for reshaping;
# the mean ± std of each cell is formatted into a display string.
ALL = Reports_Train[0].ravel()
CM = CM_Train[0].ravel()
for i in range(1, len(Reports_Train)):
    ALL = np.vstack((ALL, Reports_Train[i].ravel()))
    CM = np.vstack((CM, CM_Train[i].ravel()))
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Train = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
# Confusion matrix averaged across folds, rounded back to whole counts.
CM_Train = CM.mean(axis = 0).reshape(CM_Train[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Test: identical aggregation for the test-set reports/matrices.
ALL = Reports_Test[0].ravel()
CM = CM_Test[0].ravel()
for i in range(1, len(Reports_Test)):
    ALL = np.vstack((ALL, Reports_Test[i].ravel()))
    CM = np.vstack((CM, CM_Test[i].ravel()))
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Test = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
CM_Test = CM.mean(axis = 0).reshape(CM_Test[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Label and show the two aggregated report tables.
Reports_Train.index.name = 'Train Set (CV = % i)' % n_splits
Reports_Test.index.name = 'Test Set (CV = % i)' % n_splits
display(Reports_Train)
display(Reports_Test)
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| Train Set (CV = 10) |||||
| No | 0.8561 ± 0.0168 | 0.8779 ± 0.0226 | 0.8664 ± 0.0024 | 3622.0000 ± 0.0000 |
| Yes | 0.6379 ± 0.0197 | 0.5890 ± 0.0690 | 0.6090 ± 0.0329 | 1308.0000 ± 0.0000 |
| accuracy | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 |
| macro avg | 0.7470 ± 0.0032 | 0.7334 ± 0.0233 | 0.7377 ± 0.0156 | 4930.0000 ± 0.0000 |
| weighted avg | 0.7983 ± 0.0076 | 0.8012 ± 0.0027 | 0.7981 ± 0.0076 | 4930.0000 ± 0.0000 |
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| Test Set (CV = 10) | ||||
| No | 0.8504 ± 0.0149 | 0.8653 ± 0.0267 | 0.8573 ± 0.0071 | 1552.0000 ± 0.0000 |
| Yes | 0.6110 ± 0.0274 | 0.5766 ± 0.0640 | 0.5899 ± 0.0296 | 561.0000 ± 0.0000 |
| accuracy | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 |
| macro avg | 0.7307 ± 0.0098 | 0.7210 ± 0.0203 | 0.7236 ± 0.0141 | 2113.0000 ± 0.0000 |
| weighted avg | 0.7869 ± 0.0076 | 0.7887 ± 0.0076 | 0.7863 ± 0.0080 | 2113.0000 ± 0.0000 |
Similarly, for Confusion Matrix, we take the average of the confusion matrices of all folds.
# Plot the fold-averaged confusion matrices (raw and row-normalized) for the
# train and test sets.
# Font
font = FontProperties()
font.set_weight('bold')
Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
CM = [CM_Train, CM_Test]
for i in range(2):
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(Titles[i], fontproperties=font, fontsize = 16)
    # Left panel: raw counts.
    _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                    linewidths = 0.2, cbar_kws={"shrink": 1})
    _ = ax[0].set_title('Confusion Matrix');
    # Right panel: each row divided by its true-class total, so the diagonal
    # cells read as per-class recall.
    _ = sns.heatmap(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis],
                    annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                    linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
    _ = ax[1].set_title('Normalized Confusion Matrix');
    for a in ax:
        _ = a.set_xlabel('Predicted labels')
        _ = a.set_ylabel('True labels');
        _ = a.xaxis.set_ticklabels(Labels)
        _ = a.yaxis.set_ticklabels(Labels)
        _ = a.set_aspect(1)
Note that:
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
However, the accuracy can be a misleading metric for imbalanced data sets. Here, about 73 percent of the sample has negative (No) and about 27 percent has positive (Yes) values. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two:
\begin{align} \text{TPR} &= \frac{T_p}{T_p + F_n},\\ \text{TNR} &= \frac{T_n}{T_n + F_p},\\ \text{Balanced Accuracy (bACC)} &= \frac{1}{2}\left(\text{TPR}+\text{TNR}\right) \end{align}Header('Train Set')
def _cm_report(cm, label):
    """Print precision, recall, TPR, TNR and balanced accuracy for one split.

    cm    : 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]]
            (sklearn.metrics.confusion_matrix convention).
    label : split name interpolated into each printed line ('Train'/'Test');
            output strings match the original notebook exactly.
    """
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    tpr = tp / (tp + fn)   # recall and TPR are the same quantity
    tnr = tn / (tn + fp)
    balanced_accuracy = (tpr + tnr) / 2
    print('Precision (%s) = %.2f' % (label, precision))
    print('Recall (%s) = %.2f' % (label, tpr))
    print('TPR (%s) = %.2f' % (label, tpr))
    print('TNR (%s) = %.2f' % (label, tnr))
    print('Balanced Accuracy (%s) = %.2f' % (label, balanced_accuracy))

# The previously duplicated train/test metric code is folded into _cm_report;
# the PPCR value the original computed was never printed or used, so it is
# dropped.
_cm_report(CM_Train, 'Train')
Header('Test Set')
_cm_report(CM_Test, 'Test')
Line()
Train Set ========================================================================================== Precision (Train) = 0.64 Recall (Train) = 0.59 TPR (Train) = 0.59 TNR (Train) = 0.88 Balanced Accuracy (Train) = 0.73 Test Set =========================================================================================== Precision (Test) = 0.61 Recall (Test) = 0.58 TPR (Test) = 0.58 TNR (Test) = 0.87 Balanced Accuracy (Test) = 0.72 ====================================================================================================
To improve the results, we can implement an iterative optimization deep learning method.